1. Download data (from Norberg et al. paper) and combine training and validation data sets into one dataset

2. Subset sites

Most people attempting to model the distribution of a rare species do not necessarily have 1200 sites worth of data. They often are focused on a smaller region. Additionally, using more than 1000 sites is not recommended for the full spatial joint HMSC model (In that case, either a GPP or NNGP are suggested as alternatives, but they present their own unique complications as either the number of neighbors or the number of knots to use have to be chosen in some fashion).

Thus, to address both of these issues, we decided to reduce the sites to a subset of 400-600 sites (as Norberg et al. did).

Here we choose the 600 most southern sites (or, alternatively, the 400 most southern). The choice between 400 and 600 sites depends on whether only 400 sites allow \(\mu_{ext}\) to be placed far enough away from \(\mu_{avg}\) while still leaving enough sites within 1 standard deviation of \(\mu_{ext}\).

# US state outlines used as the base layer for every site map in this document.
us <- map_data("state")

# Plot the selected southern sites (red points) over the state polygons,
# using an Albers projection clipped to the study region.
ggplot() +
  geom_polygon(
    data = us,
    aes(x = long, y = lat, group = group),
    colour = "grey20", fill = "grey80"
  ) +
  geom_point(data = south, aes(x = long, y = lat), colour = "red") +
  coord_map(
    projection = "albers", lat0 = 39, lat1 = 40,
    xlim = c(-100, -75), ylim = c(25, 40)
  )

# Total number of presences per species across the southern subset of sites.
# NOTE(review): columns 6:67 cover sp1-sp62 here, but later code indexes the
# species columns as 6:68 / j + 5 for 63 species -- confirm the true count.
colSums(south[,6:67])
##  sp1  sp2  sp3  sp4  sp5  sp6  sp7  sp8  sp9 sp10 sp11 sp12 sp13 sp14 sp15 sp16 
##   41  229   24   22   56   20   10   64   18   55   21   16  200   10   24   44 
## sp17 sp18 sp19 sp20 sp21 sp22 sp23 sp24 sp25 sp26 sp27 sp28 sp29 sp30 sp31 sp32 
##   18   71   12    4   10   43   12   16   28   15   70   11  121   21   25   21 
## sp33 sp34 sp35 sp36 sp37 sp38 sp39 sp40 sp41 sp42 sp43 sp44 sp45 sp46 sp47 sp48 
##  105   51   19   31   12   88   42   17   27   13   26   31   14   32   20    4 
## sp49 sp50 sp51 sp52 sp53 sp54 sp55 sp56 sp57 sp58 sp59 sp60 sp61 sp62 
##    8   31   57   22  124   17    1   50   17    8   11   11   56   20

Check to see how species are split

 cols <- brewer.pal(3, "Dark2")

temp <- dat

for(i in 1:1200){
  temp$N[i] <- sum(temp[i,6:68])
}

# Map per-site species richness N (computed above): both point colour and
# point size scale with N, on an Albers projection of the full study region.
 ggplot() +
  geom_polygon(data=us, aes(x=long, y=lat, 
  group=group), colour="grey20", fill="grey80") +
geom_point(data=temp, aes(x=long, y=lat, color=N, size=N),shape=19, alpha=0.8) + scale_color_viridis() +
   scale_size(range=c(0.01,3)) +
   coord_map(projection = "albers", lat0=39, lat1=40, xlim=c(-100, -65), ylim=c(25, 50))

# For every species, map its presences (solid dots) and absences (faint
# crosses), coloured by region: southern sites in cols[1], northern in cols[2].
for (sp in 1:63) {
  sp_col <- sp + 5  # occupancy column for this species

  s_dat <- south[, c(1:5, sp_col)]
  n_dat <- north[, c(1:5, sp_col)]
  new_names <- c("long", "lat", "PC1", "PC2", "PC3", "SP")
  names(s_dat) <- new_names
  names(n_dat) <- new_names

  p <- ggplot() +
    geom_polygon(data = us, aes(x = long, y = lat, group = group),
                 colour = "grey20", fill = "grey80") +
    # South: absences as faint crosses, presences as solid dots.
    geom_point(data = s_dat[s_dat$SP == 0, ], aes(x = long, y = lat),
               colour = cols[1], shape = 4, alpha = 0.2, size = 0.1) +
    geom_point(data = s_dat[s_dat$SP == 1, ], aes(x = long, y = lat),
               colour = cols[1], shape = 19, alpha = 1, size = 1) +
    # North: same encoding in the second palette colour.
    geom_point(data = n_dat[n_dat$SP == 0, ], aes(x = long, y = lat),
               colour = cols[2], shape = 4, alpha = 0.2, size = 0.1) +
    geom_point(data = n_dat[n_dat$SP == 1, ], aes(x = long, y = lat),
               colour = cols[2], shape = 19, alpha = 1, size = 1) +
    ggtitle(paste0("Species: ", sp)) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 40,
              xlim = c(-100, -65), ylim = c(25, 50))

  plot(p)
}

How is niche-space split?

# Pairwise scatterplots of the three environmental PCs, showing how the
# south/north split partitions niche space: southern sites in cols[1],
# northern sites in cols[2].
#PC1 x PC2
ggplot() + 
  geom_point(data=south, aes(x=PC1, y=PC2), color=cols[1], shape=16, size=2, alpha=0.25, show.legend=F) +
  geom_point(data=north, aes(x=PC1, y=PC2),
             color=cols[2], shape=16, size=2, alpha=0.25)+
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC2]))

#PC1 x PC3
ggplot() + 
  geom_point(data=south, aes(x=PC1, y=PC3), color=cols[1], shape=16, size=2, alpha=0.25, show.legend=F) +
  geom_point(data=north, aes(x=PC1, y=PC3),
             color=cols[2], shape=16, size=2, alpha=0.25)+
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC3]))

#PC2 x PC3
ggplot() + 
  geom_point(data=south, aes(x=PC2, y=PC3), color=cols[1], shape=16, size=2, alpha=0.25, show.legend=F) +
  geom_point(data=north, aes(x=PC2, y=PC3),
             color=cols[2], shape=16, size=2, alpha=0.25)+
  xlab(bquote(mu[PC2])) + ylab(bquote(mu[PC3]))

3. Determine individual species’ niche breadths as well as community weighted average of each PC.

# Per-species niche summaries on the southern sites. Rows 1-63 are the
# individual species; row 64 ("Mean") holds the community-weighted average.
# Columns are initialized to NA (rather than 0) so that any value never filled
# in by the loop below is visibly missing instead of a misleading zero, and
# the min/max columns are preallocated here instead of being grown implicitly
# inside the loop (which left them NA in the "Mean" row anyway).
X_bar <- data.frame(mu_PC1 = rep(NA_real_, 64),
                    sd_PC1 = rep(NA_real_, 64),
                    min_PC1 = rep(NA_real_, 64),
                    max_PC1 = rep(NA_real_, 64),
                    mu_PC2 = rep(NA_real_, 64),
                    sd_PC2 = rep(NA_real_, 64),
                    min_PC2 = rep(NA_real_, 64),
                    max_PC2 = rep(NA_real_, 64),
                    mu_PC3 = rep(NA_real_, 64),
                    sd_PC3 = rep(NA_real_, 64),
                    min_PC3 = rep(NA_real_, 64),
                    max_PC3 = rep(NA_real_, 64),
                    N = rep(NA_real_, 64))

# Species labels, plus a final row for the community mean.
rownames(X_bar) <- c(paste0("sp", 1:63), "Mean")

# For each of the 63 species, summarise the environment of its occupied
# southern sites: mean, sd, min and max along each PC, plus the presence count.
for (j in 1:63) {
  occ <- subset(south, south[, j + 5] == 1)
  for (pc in c("PC1", "PC2", "PC3")) {
    vals <- occ[[pc]]
    X_bar[[paste0("mu_", pc)]][j] <- mean(vals)   # niche optimum
    X_bar[[paste0("sd_", pc)]][j] <- sd(vals)     # niche breadth
    X_bar[[paste0("min_", pc)]][j] <- min(vals)
    X_bar[[paste0("max_", pc)]][j] <- max(vals)
  }
  X_bar$N[j] <- nrow(occ)  # number of presences for this species
}

# Fill row 64 with the community average: the mean of the per-species niche
# means, and the (NA-robust) mean of the per-species niche breadths.
for (pc in c("PC1", "PC2", "PC3")) {
  mu_col <- paste0("mu_", pc)
  sd_col <- paste0("sd_", pc)
  X_bar[[mu_col]][64] <- mean(X_bar[[mu_col]][1:63])
  X_bar[[sd_col]][64] <- mean(X_bar[[sd_col]][1:63], na.rm = TRUE)
}





# Species niche optima overlaid on the site environments: grey/viridis points
# are individual sites, small filled circles are the per-species means (rows
# 1:63 of X_bar), and the single large circle is the community average (row 64).
#PC1 vs PC2
ggplot() + 
  geom_point(data=south, aes(x=PC1, y=PC2, color=PC3), shape=16, size=2, alpha=0.25, show.legend=F) +
  scale_color_continuous(type="viridis", limits=c(-10,3))+ 
  geom_point(data=X_bar[1:63,], aes(x=mu_PC1, y=mu_PC2,fill=mu_PC3), shape=21, size=2) +
  geom_point(data=X_bar[64,], aes(x=mu_PC1, y=mu_PC2, fill=mu_PC3), shape=21, size=5) + 
  scale_fill_continuous(type="viridis", limits=c(-10,3)) +
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC2])) +
  labs(fill=bquote(mu[PC3]))

#PC1 vs PC3
ggplot() +
   geom_point(data=south, aes(x=PC1, y=PC3, color=PC2), shape=16, size=2, alpha=0.25, show.legend=F) +
  scale_color_continuous(type="viridis", limits=c(-9,16))+
  geom_point(data=X_bar[1:63,], aes(x=mu_PC1, y=mu_PC3,fill=mu_PC2), shape=21, size=2) +
  geom_point(data=X_bar[64,], aes(x=mu_PC1, y=mu_PC3, fill=mu_PC2), shape=21, size=5) + 
  scale_fill_continuous(type="viridis", limits=c(-9,16)) +
  xlab(bquote(mu[PC1])) + ylab(bquote(mu[PC3])) +
  labs(fill=bquote(mu[PC2]))

#PC2 vs PC3
ggplot() + 
  geom_point(data=south, aes(x=PC2, y=PC3, color=PC1), shape=16, size=2, alpha=0.25, show.legend=F) +
  scale_color_continuous(type="viridis", limits=c(-9,13))+ 
  geom_point(data=X_bar[1:63,], aes(x=mu_PC2, y=mu_PC3,fill=mu_PC1), shape=21, size=2) +
  geom_point(data=X_bar[64,], aes(x=mu_PC2, y=mu_PC3, fill=mu_PC1), shape=21, size=5) + 
  scale_fill_continuous(type="viridis", limits=c(-9,13)) +
  xlab(bquote(mu[PC2])) + ylab(bquote(mu[PC3])) +
  labs(fill=bquote(mu[PC1]))

How does niche breadth vary with abundance?

# Niche breadth (sd of each PC over occupied sites) against the number of
# presences per species. Dashed black lines mark the 10th and 90th percentiles
# of the per-species breadths; the dashed red line is the community mean
# breadth (row 64 of X_bar).
ggplot() + geom_point(data=X_bar[1:63,], aes(x=N, y=sd_PC1), shape=16)  + ylab(bquote(sigma[PC1])) +
  geom_hline(yintercept=quantile(X_bar$sd_PC1, probs=0.10, na.rm=T), linetype="dashed") +
  geom_hline(yintercept=quantile(X_bar$sd_PC1, probs=0.90, na.rm=T), linetype="dashed") +
  geom_hline(yintercept=X_bar$sd_PC1[64], linetype="dashed", col="red") 

ggplot() + geom_point(data=X_bar[1:63,], aes(x=N, y=sd_PC2), shape=16)  + ylab(bquote(sigma[PC2])) +
  geom_hline(yintercept=quantile(X_bar$sd_PC2, probs=0.10, na.rm=T), linetype="dashed") +
  geom_hline(yintercept=quantile(X_bar$sd_PC2, probs=0.9, na.rm=T), linetype="dashed") +
    geom_hline(yintercept=X_bar$sd_PC2[64], linetype="dashed", col="red") 

ggplot() + geom_point(data=X_bar[1:63,], aes(x=N, y=sd_PC3), shape=16)  + ylab(bquote(sigma[PC3])) +
  geom_hline(yintercept=quantile(X_bar$sd_PC3, probs=0.10, na.rm=T), linetype="dashed") +
  geom_hline(yintercept=quantile(X_bar$sd_PC3, probs=0.90, na.rm=T), linetype="dashed") +
    geom_hline(yintercept=X_bar$sd_PC3[64], linetype="dashed", col="red") 

# Candidate values for the "extreme" niche optimum (mu) and the narrow niche
# breadth (sd) along PC1 and PC3. The mu grids deliberately skip values near
# the community average; sd is scanned on a fine 0.1 grid.
tries_PC1 <- c(seq(from = -9, to = -2.5, by = 0.5), seq(from = -2, to = 1, by = 0.5))
tries_PC3 <- c(seq(from = -10, to = -1, by = 0.5), seq(from = 1, to = 3, by = 0.5))
tries_sigma_narrow_PC1 <- seq(from = 0.1, to = 3, by = 0.1)
tries_sigma_narrow_PC3 <- seq(from = 0.1, to = 3, by = 0.1)

# Full factorial grid of candidate (mu, sd) combinations.
tries <- setNames(
  expand.grid(tries_PC1, tries_PC3, tries_sigma_narrow_PC1, tries_sigma_narrow_PC3),
  c("mu_PC1", "mu_PC3", "sd_PC1", "sd_PC3")
)

# Community-average optimum in (PC1, PC3), and the "wide" niche covariance:
# a diagonal matrix whose variances are the squared 90th-percentile per-species
# niche sds.
mu_avg <- c(X_bar$mu_PC1[64], X_bar$mu_PC3[64])
Sigma_wide <- diag(x=c(quantile(X_bar$sd_PC1, probs=0.90, na.rm=T)^2,  quantile(X_bar$sd_PC3, probs=0.90, na.rm=T)^2))



# Result frame: one row per candidate (mu, sd) combination, with placeholder
# columns for the three screening statistics computed by the loop below. Each
# p128 value is the (normalised) suitability of the 128th-best site relative
# to the best site under a given scenario.
df_tries <- data.frame(
  tries[c("mu_PC1", "mu_PC3", "sd_PC1", "sd_PC3")],
  p128_narrow_avg = NA,
  p128_wide_ext = NA,
  p128_narrow_ext = NA
)





# Screen every candidate (mu_ext, Sigma_narrow) against the southern sites.
# For each candidate, normalise the multivariate-normal suitability over all
# sites and record how suitable the 128th-best site is relative to the best
# site, for three scenarios: narrow/avg, narrow/ext, wide/ext.
#
# NOTE(review): Sigma_narrow is built from the sd columns WITHOUT squaring,
# while Sigma_wide (above) squares its quantiles -- so the "sd" values here
# are effectively treated as variances. This matches the later hard-coded
# Sigma_narrow <- diag(c(0.9, 2.5)), so it is preserved; confirm intent.

# Hoist the loop-invariant site coordinates (PC1, PC3) out of the loop.
site_env <- south[, c(3, 5)]
n_tries <- nrow(df_tries)

for (ii in seq_len(n_tries)) {
  mu_ext <- c(df_tries$mu_PC1[ii], df_tries$mu_PC3[ii])
  Sigma_narrow <- diag(c(df_tries$sd_PC1[ii], df_tries$sd_PC3[ii]))

  # Narrow niche centred on the community average.
  L_narrow_avg <- dmvnorm(site_env, mean = mu_avg, sigma = Sigma_narrow)
  L_narrow_avg <- sort(L_narrow_avg / sum(L_narrow_avg), decreasing = TRUE)

  # Narrow niche centred on the candidate extreme optimum.
  L_narrow_ext <- dmvnorm(site_env, mean = mu_ext, sigma = Sigma_narrow)
  L_narrow_ext <- sort(L_narrow_ext / sum(L_narrow_ext), decreasing = TRUE)

  # Wide niche centred on the candidate extreme optimum.
  L_wide_ext <- dmvnorm(site_env, mean = mu_ext, sigma = Sigma_wide)
  L_wide_ext <- sort(L_wide_ext / sum(L_wide_ext), decreasing = TRUE)

  # Ratio of the 128th-highest suitability to the maximum; after the
  # decreasing sort, the maximum is simply the first element.
  df_tries$p128_narrow_avg[ii] <- L_narrow_avg[128] / L_narrow_avg[1]
  df_tries$p128_narrow_ext[ii] <- L_narrow_ext[128] / L_narrow_ext[1]
  df_tries$p128_wide_ext[ii] <- L_wide_ext[128] / L_wide_ext[1]

  # Progress indicator for the long grid search.
  if (ii %% 10000 == 0) {
    cat(paste0(ii, " of ", n_tries, "\n"))
  }
}



# Cache the (slow) grid-search results so the document can be re-run without
# recomputing; the load() restores them when the loop above is skipped.
save(df_tries, file="../data/df_tries.RData")
load("../data/df_tries.RData")



# Successively filter the grid-search candidates: the 128th-best site must be
# at least half as suitable as the best site under all three scenarios, then
# restrict to narrow sd values near their minima and a not-too-extreme mu_PC1.
df_tries2 <- subset(df_tries, p128_narrow_ext > 0.5)
df_tries3 <- subset(df_tries2, p128_wide_ext > 0.5)
df_tries4 <- subset(df_tries3, p128_narrow_avg > 0.5)
min(df_tries4$sd_PC1)
## [1] 0.7
df_tries5 <- subset(df_tries4, sd_PC1 < 1)
min(df_tries5$sd_PC3)
## [1] 2.5
df_tries6 <- subset(df_tries5, sd_PC3 < 2.6)
df_tries7 <- subset(df_tries6, mu_PC1 > -4)

# Surviving candidates, ranked by p128_narrow_ext (ascending).
df_tries7[order(df_tries7$p128_narrow_ext), ]
##        mu_PC1 mu_PC3 sd_PC1 sd_PC3 p128_narrow_avg p128_wide_ext
## 367323   -3.5    1.0    0.9    2.5       0.5025202     0.8216325
## 367365   -3.5    2.0    0.9    2.5       0.5025202     0.8472845
## 367344   -3.5    1.5    0.9    2.5       0.5025202     0.8615871
##        p128_narrow_ext
## 367323       0.5081342
## 367365       0.5427180
## 367344       0.5461169
# Chosen extreme optimum and narrow niche covariance (picked from the grid
# search results above).
# NOTE(review): diag(c(0.9, 2.5)) uses the selected "sd" values unsquared as
# the covariance diagonal, whereas Sigma_wide below squares its quantiles --
# confirm this asymmetry is intentional (it matches the grid-search loop).
mu_ext <- c(-3.5, 2)
Sigma_narrow <- diag(c(0.9, 2.5))

# Community-average optimum and wide niche covariance (squared
# 90th-percentile per-species niche sds).
mu_avg <- c(X_bar$mu_PC1[64], X_bar$mu_PC3[64])
Sigma_wide <- diag(x=c(quantile(X_bar$sd_PC1, probs=0.90, na.rm=T)^2,  quantile(X_bar$sd_PC3, probs=0.90, na.rm=T)^2))

File structure: Model Type / Species Type / Sample Size — each combination contains the 30 replicates and 3 data-splits.

4. Construct Suitability Surfaces

First: unidimensional surfaces

There are 4 scenarios: narrow niche (narrow) vs. wide niche (wide), and community-average (avg) vs extreme (extreme).

# Univariate (PC1) view of the four niche scenarios: normal density curves
# for the wide/narrow niches centred on the community-average and extreme
# optima, overlaid on histograms of site PC1 values ("e", grey) and species
# niche means ("f", green). Note: cols is redefined here to a 4-colour
# viridis palette, replacing the earlier Dark2 palette.
# NOTE(review): the ..density.. notation is superseded by after_stat(density)
# in ggplot2 >= 3.4; left as-is to match the rest of the document.
cols <- viridis(4)

#PC1

ggplot(data = data.frame(x = c(-9, 13)), aes(x)) +
   geom_histogram(data=south, aes(x=PC1, y= ..density.., fill="e"), binwidth=1, alpha=0.5) +
  geom_histogram(data=X_bar[1:63,], aes(x=mu_PC1, y=..density.., fill="f"), binwidth=1, alpha=0.5)+
stat_function(fun = dnorm, n = 101, args = list(mu_avg[1], sd = sqrt(Sigma_wide[1,1])), aes(color = "a")) + 
  stat_function(fun = dnorm, n = 101, args = list(mean = mu_avg[1], sd = sqrt(Sigma_narrow[1,1]))), aes(color="b")) +
  stat_function(fun = dnorm, aes(color="c"), n = 101, args = list(mu_ext[1], sd = sqrt(Sigma_narrow[1,1]))) +
  stat_function(fun = dnorm, aes(color="d"), n = 101, args = list(mu_ext[1], sd = sqrt(Sigma_wide[1,1]))) +
ylab("Density") + xlab("PC1") +
  scale_colour_manual(name = 'Case', guide='legend', 
         values =c("a" = cols[1], "b" = cols[2], "c" = cols[3], "d"=cols[4]), labels = c('Wide - Avg','Narrow - Avg', "Narrow - Ext", "Wide - Ext")) +
  scale_fill_manual(name = "Mean Value", guide='legend', 
                    values=c("e"="grey", "f"="green"),
                    labels=c("Sites", "Species"))

# #PC2
# ggplot(data = data.frame(x = c(-9, 16)), aes(x)) +
#    geom_histogram(data=dat, aes(x=PC2, y= ..density.., fill="e"), binwidth=1, alpha=0.5) +
#   geom_histogram(data=X_bar[1:63,], aes(x=mu_PC2, y=..density.., fill="f"), binwidth=1, alpha=0.5)+
# stat_function(fun = dnorm, n = 101, args = list(mu_avg[2], sd = sqrt(Sigma_wide[2,2])), aes(color = "a")) + 
#   stat_function(fun = dnorm, n = 101, args = list(mean = mu_avg[2], sd = sqrt(Sigma_narrow[2,2])), aes(color="b")) +
#   stat_function(fun = dnorm, aes(color="c"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_narrow[2,2]))) +
#   stat_function(fun = dnorm, aes(color="d"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_wide[2,2]))) +
# ylab("Density") + xlab("PC2") +
#   scale_colour_manual(name = 'Case', guide='legend', 
#          values =c("a" = cols[1], "b" = cols[2], "c" = cols[3], "d"=cols[4]), labels = c('Wide - Avg','Narrow - Avg', "Narrow - Ext", "Wide - Ext")) +
#   scale_fill_manual(name = "Mean Value", guide='legend', 
#                     values=c("e"="grey", "f"="green"),
#                     labels=c("Sites", "Species"))
  
# Same univariate scenario view for PC3. Index [2] of mu_avg / mu_ext and
# [2,2] of the Sigma matrices refer to PC3 here, because those objects hold
# the (PC1, PC3) pair -- PC2 is the PC treated as irrelevant (see below).
#PC3
ggplot(data = data.frame(x = c(-11, 4)), aes(x)) +
   geom_histogram(data=south, aes(x=PC3, y= ..density.., fill="e"), binwidth=1, alpha=0.5) +
  geom_histogram(data=X_bar[1:63,], aes(x=mu_PC3, y=..density.., fill="f"), binwidth=1, alpha=0.5)+
stat_function(fun = dnorm, n = 101, args = list(mu_avg[2], sd = sqrt(Sigma_wide[2,2])), aes(color = "a")) + 
  stat_function(fun = dnorm, n = 101, args = list(mean = mu_avg[2], sd = sqrt(Sigma_narrow[2,2]))), aes(color="b")) +
  stat_function(fun = dnorm, aes(color="c"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_narrow[2,2]))) +
  stat_function(fun = dnorm, aes(color="d"), n = 101, args = list(mu_ext[2], sd = sqrt(Sigma_wide[2,2]))) +
ylab("Density") + xlab("PC3") +
  scale_colour_manual(name = 'Case', guide='legend', 
         values =c("a" = cols[1], "b" = cols[2], "c" = cols[3], "d"=cols[4]), labels = c('Wide - Avg','Narrow - Avg', "Narrow - Ext", "Wide - Ext")) +
  scale_fill_manual(name = "Mean Value", guide='legend', 
                    values=c("e"="grey", "f"="green"),
                    labels=c("Sites", "Species"))

Multivariate distribution

It is unlikely that species respond strongly to all three PCs. We can therefore ignore one of the PCs, for the sake of simplicity.

Here we will fix PC2 as irrelevant for determining habitat suitability for our simulated species, but we could have just as easily assumed one of the other two PCs was irrelevant.

#Case 1: Community-average, wide niche

# Regular 100 x 100 grid over (PC1, PC3) niche space, plus the plotting
# palette for the suitability surfaces.
x1 <- seq(-9, 2, length.out = 100)
x3 <- seq(-10.5, 3.5, length.out = 100)
my.palette <- brewer.pal(11, "RdYlBu")

df <- setNames(expand.grid(x1, x3), c("PC1", "PC3"))

# Habitat suitability surfaces for the four scenarios: wide/narrow niche
# crossed with community-average/extreme optimum.
df$L_wide_avg <- dmvnorm(x = df[, 1:2], mean = mu_avg, sigma = Sigma_wide)
df$L_wide_ext <- dmvnorm(x = df[, 1:2], mean = mu_ext, sigma = Sigma_wide)
df$L_narrow_avg <- dmvnorm(x = df[, 1:2], mean = mu_avg, sigma = Sigma_narrow)
df$L_narrow_ext <- dmvnorm(x = df[, 1:2], mean = mu_ext, sigma = Sigma_narrow)


# Plot each of the four suitability surfaces as a heat map over (PC1, PC3)
# niche space, with the actual southern sites overlaid as small crosses so
# the overlap between each niche and the available environments is visible.
#I. Wide Niche- Community Average
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) + 
  geom_tile(data = df, aes(x=PC1, y = PC3, fill= L_wide_avg)) + 
  scale_fill_gradientn(colors=my.palette) + 
  theme(axis.text=element_text(size=15))+
  theme(axis.title=element_text(size=15)) +
  theme(title = element_text(size=15)) +
  theme(plot.title=element_text(hjust=0.5)) + 
    geom_point(data=south, aes(x=PC1, y=PC3), shape=3, alpha=0.2, size=.8)+
  xlab("PC1") + ylab("PC3") +
  ggtitle("I. Wide Niche - Community Average")

#II. Wide Niche - Extreme Value
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) + 
  geom_tile(data = df, aes(x=PC1, y = PC3, fill= L_wide_ext)) + 
  scale_fill_gradientn(colors=my.palette) + 
  theme(axis.text=element_text(size=15))+
    geom_point(data=south, aes(x=PC1, y=PC3), shape=3, alpha=0.2, size=.8)+
  theme(axis.title=element_text(size=15)) +
  theme(title = element_text(size=15)) +
  theme(plot.title=element_text(hjust=0.5)) + 
  xlab("PC1") + ylab("PC3") +
  ggtitle("II. Wide Niche - Extreme Value")

#III. Narrow Niche - Community Average
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) + 
  geom_tile(data = df, aes(x=PC1, y = PC3, fill= L_narrow_avg)) + 
  scale_fill_gradientn(colors=my.palette) + 
    geom_point(data=south, aes(x=PC1, y=PC3), shape=3, alpha=0.2, size=.8)+
  theme(axis.text=element_text(size=15))+
  theme(axis.title=element_text(size=15)) +
  theme(title = element_text(size=15)) +
  theme(plot.title=element_text(hjust=0.5)) + 
  xlab("PC1") + ylab("PC3") +
  ggtitle("III. Narrow Niche - Community Average")

#IV. Narrow Niche - Extreme Value 
ggplot() + xlim(-9, 2) + ylim(-10.5, 3.5) + 
  geom_tile(data = df, aes(x=PC1, y = PC3, fill= L_narrow_ext)) + 
  scale_fill_gradientn(colors=my.palette) +
    geom_point(data=south, aes(x=PC1, y=PC3), shape=3, alpha=0.2, size=.8)+
  theme(axis.text=element_text(size=15))+
  theme(axis.title=element_text(size=15)) +
  theme(title = element_text(size=15)) +
  theme(plot.title=element_text(hjust=0.5)) + 
  xlab("PC1") + ylab("PC3") +
  ggtitle("IV. Narrow Niche - Extreme Value")

5. Generate Simulated Species

Let \(N \in \{2, 4, 8, 16, 32, 64\}\) denote the desired number of presences to be used in a training model. Steps for generating simulated species:

  1. Generate \(2 * N\) presences according to the habitat suitability function \(L_s\). Repeat for 30 replicates

  2. For each replicate, split the data into two folds such that each fold has \(N\) presences. Repeat for 3 data-splits.

    • For N = 2, there are \({4 \choose 2} = 6\) ways of splitting the data, of which only 3 are unique: 12/34, 13/24, 14/23.

    • Check that species are well-modeled:

      • For each of the data splits, calculate the probability of presence by adding up \(L\) for each of the sites at which the species is present. There will be 6 of these values for each of the 30 replicates for one species at each level of sample size.
      • For each sample-size x species category, make a histogram of the site IDs that are selected. Ideally there should be some variation here, in that each of the 30 replicates have different sites at which the species is present.
  3. Use the HMSC cross-validation function to run 2-fold cross-validation for each of the three data-splits. This will result in 3 estimates of AUC, RMSE and Tjur’s \(R^2\) that are averaged over the two folds within the split. Each model will be trained on a data set that has \(N\) presences, and the model will be evaluated using a heldout portion of the data that also has \(N\) presences.

  4. In order to evaluate the ability of the models to recover the response curves:

    • Run the model on folds (12), (34), (13), (24), (14), (23) to come up with 6 estimates of \(\rho\) for each of the \(\beta\) coefficients. Take the average. Calculate Pearson and Spearman. (Can do a logit transform to approximate normality)

For each species \(s\), for each sample size \(n\) and for each replicate \(r\):

sims[[s]][[n]][[r]] contains the following objects:

  • index: stores the locations of presences. For each species x replicate x sample size combo, there should be \(2*n\) presences

  • Ysim: Binary indicator vector of length 600 (one entry for each site within the study). A site has value 1 if it is occupied and 0 otherwise.

  • L: Numeric vector of length 600 (one entry for each site within the study). The values are the habitat suitabilities calculated using the multivariate normal distribution for species \(s\).

  • split1, split2, split3: Three vectors, each of length 600. A given split indicates which fold (1 or 2) each site is in.

set.seed(12)

# Simulation design: number of sites, replicate labels, target presence
# counts (2*N presences are generated, then split into two folds of N),
# species scenarios, and the three data-split labels.
NSites <- length(south$long)
row.names(south) <- seq_len(NSites)
replicates <- paste0("rep", 1:30)
NPresences <- c(64, 32, 16, 8, 4, 2)
sizes <- paste0("size", NPresences) # Desired sample size
species <- c("wide_avg", "wide_ext",
             "narrow_avg", "narrow_ext")
splits <- paste0("split", 1:3)

# Nested list skeleton: sims[[species]][[size]][[replicate]][[split]].
# seq_along() replaces 1:length(), which misbehaves on zero-length vectors.
sims <- vector("list", length(species))
names(sims) <- species
for (s in seq_along(species)) {
  sims[[s]] <- vector("list", length(NPresences))
  names(sims[[s]]) <- sizes
  for (n in seq_along(NPresences)) {
    sims[[s]][[n]] <- vector("list", length(replicates))
    names(sims[[s]][[n]]) <- replicates
    for (r in seq_along(replicates)) {
      sims[[s]][[n]][[r]] <- vector("list", length(splits))
      names(sims[[s]][[n]][[r]]) <- splits
    }
  }
}
    
    
   

# Generate the simulated species. For each scenario s: compute the habitat
# suitability L at every southern site, then for each sample size n and
# replicate r, draw 2*N presence sites (without replacement, weighted by
# normalised L) and build the three fold assignments described in the text
# (12/34, 13/24, 14/23 style splits of the presences, each padded with an
# equal random share of the absences).
for (s in 1:length(species)) {
      # Suitability surface for this scenario (stored once per species, at
      # the top level of sims[[s]], not per replicate).
      switch(species[s],
          "wide_avg"={
            sims[[s]]$L <-
              dmvnorm(south[,c(3,5)],
                      mean=mu_avg,
                      sigma=Sigma_wide )
            
          },
          "wide_ext" = {
            sims[[s]]$L <-
              dmvnorm(south[,c(3,5)],
                      mean = mu_ext,
                      sigma=Sigma_wide )
          },
            "narrow_avg" = {
            sims[[s]]$L <-
              dmvnorm(south[,c(3,5)],
                      mean = mu_avg,
                      sigma=Sigma_narrow)
          },
             "narrow_ext" = {
            sims[[s]]$L <-
             dmvnorm(south[,c(3,5)],
                      mean = mu_ext,
                      sigma=Sigma_narrow)
          })#end of switch
        
  for(n in 1:length(sizes)) {
   
    for(r in 1:length(replicates)){
        # NOTE(review): this matrix assignment to $index is dead code -- it
        # is overwritten by the sample() call a few lines below.
        sims[[s]][[n]][[r]]$index <- matrix(ncol=(2*NPresences[n]))
        # One-column presence/absence indicator, one row per site.
        sims[[s]][[n]][[r]]$YSim <- matrix(0, nrow=NSites, ncol=1)
        sims[[s]][[n]][[r]]$YSim  <- data.frame(sims[[s]][[n]][[r]]$YSim )
names(sims[[s]][[n]][[r]]$YSim ) <- "SimSp"
        
             # Draw 2*N presence sites, weighted by normalised suitability.
             sims[[s]][[n]][[r]]$index <- sample(1:NSites, 2*NPresences[n], replace=F, prob=sims[[s]]$L/sum(sims[[s]]$L))
        
        # Mark the sampled sites as occupied.
        for(site in 1:NSites) {
    if(site %in% sims[[s]][[n]][[r]]$index) {
      sims[[s]][[n]][[r]]$YSim[site,] <- 1
    }
        }
        
        #Split 1 = 12/34
        sites <- 1:NSites
        # Split 1: first half of the drawn presences go to fold 1.
        presences1 <- sims[[s]][[n]][[r]]$index[1:NPresences[n]]
        # Split 2: odd-numbered positions (1st, 3rd, ...) go to fold 1.
        presences2 <-
          sims[[s]][[n]][[r]]$index[seq(from=1, to=2*NPresences[n], by=2)]
        # Split 3: positions where the repeating 1:4 pattern is 1 or 4
        # (e.g. for N = 2 that is positions 1 and 4 -- the "14/23" split).
        thing <- rep(1:4, NPresences[n]/2)
        presences3 <- sims[[s]][[n]][[r]]$index[thing==1 | thing==4]
        
       # Split the remaining (absence) sites evenly between the two folds;
       # assumes NSites - 2*N is even.
       NAbsences <- (NSites - 2*NPresences[n])/2
       absences <- sites[! sites %in% sims[[s]][[n]][[r]]$index]
       absences1 <- sample(absences, NAbsences, replace=F)
       absences2 <- sample(absences, NAbsences, replace=F)
       absences3 <- sample(absences, NAbsences, replace=F)
       one <- c(presences1, absences1)
       two <- c(presences2, absences2)
       three <- c(presences3, absences3)
       # Encode each split as a fold label (1 or 2) per site.
       for (k in 1:NSites) {
        if(k %in% one) {
      sims[[s]][[n]][[r]]$split1[k] <- 1
        }
         else{sims[[s]][[n]][[r]]$split1[k] <- 2}
       }
       
       for (k in 1:NSites) {
        if(k %in% two) {
      sims[[s]][[n]][[r]]$split2[k] <- 1
        }
         else{sims[[s]][[n]][[r]]$split2[k] <- 2}
       }
       
       for (k in 1:NSites) {
        if(k %in% three) {
      sims[[s]][[n]][[r]]$split3[k] <- 1
        }
         else{sims[[s]][[n]][[r]]$split3[k] <- 2}
       }
       
       
  }
}
}


# Persist the full simulation object for the downstream model runs.
save(sims, file="../data/sims_data.RData")

Checking that simulation worked

Do the splits separate species into halves correctly?

Just pick one to look at, wide-average size 4 replicate 1:

# Reload the simulated data and redefine the design vectors so this sanity
# check can be run standalone.
load("../data/sims_data.RData")
replicates <- paste0("rep", 1:30)
NPresences <- c(64, 32, 16, 8, 4, 2)
sizes <- paste0("size", NPresences) # Desired sample size
species <- c("wide_avg", "wide_ext",
             "narrow_avg", "narrow_ext")
splits <- paste0("split", 1:3)

# Use the actual number of rows rather than the previous hard-coded 474,
# which contradicted the 600-site narrative above and would silently break
# if the site subset changes.
row.names(south) <- seq_len(nrow(south))

# Attach one example simulation (wide-average niche, size 4, replicate 1):
# flatten the one-column YSim data frame to a plain vector, then add the
# three fold-assignment vectors.
temp <- south
temp$YSim  <- sims$wide_avg$size4$rep1$YSim
temp$YSim <- as.matrix(temp$YSim)
temp$YSim <- temp$YSim[,1]
temp$split1 <- sims$wide_avg$size4$rep1$split1
temp$split2 <- sims$wide_avg$size4$rep1$split2
temp$split3 <- sims$wide_avg$size4$rep1$split3

# Map each of the three fold assignments for the example simulation:
# presences are filled circles (fill = fold), absences are crosses
# (colour = fold). Each fold should contain half of the presences.
ggplot() +
  geom_polygon(data=us, aes(x=long, y=lat, 
  group=group), colour="grey20", fill="grey80") +
  #Group 1
geom_point(data=temp[temp$YSim==1,], aes(x=long, y=lat, fill=split1), shape=21) +
  geom_point(data=temp[temp$YSim==0,], aes(x=long, y=lat, color=split1), shape=4) +coord_map(projection = "albers", lat0=39, lat1=40, xlim=c(-100, -75), ylim=c(25, 40))

ggplot() +
  geom_polygon(data=us, aes(x=long, y=lat, 
  group=group), colour="grey20", fill="grey80") +
geom_point(data=temp[temp$YSim==1,], aes(x=long, y=lat, fill=split2), shape=21) +
  geom_point(data=temp[temp$YSim==0,], aes(x=long, y=lat, color=split2), shape=4) +coord_map(projection = "albers", lat0=39, lat1=40, xlim=c(-100, -75), ylim=c(25, 40))

ggplot() +
  geom_polygon(data=us, aes(x=long, y=lat, 
  group=group), colour="grey20", fill="grey80") +
geom_point(data=temp[temp$YSim==1,], aes(x=long, y=lat, fill=split3), shape=21) +
  geom_point(data=temp[temp$YSim==0,], aes(x=long, y=lat, color=split3), shape=4) +coord_map(projection = "albers", lat0=39, lat1=40, xlim=c(-100, -75), ylim=c(25, 40))

  • For each of the data splits, calculate the probability of presence by adding up \(L\) for each of the sites at which the species is present. There will be 6 of these values for each of the 30 replicates for one species at each level of sample size.

Are the presences selected such that sites with the highest probability of presence are selected more often?

# Sanity check: the suitability values L of the occupied sites (red lines)
# should fall toward the high end of each scenario's suitability histogram.
# NOTE(review): r loops over only the first 4 replicates (of 30), presumably
# to limit output -- confirm this is intentional.
par(mfrow=c(1,2))
for ( s in 1:4){
  for (n in 1:6){
    for (r in 1:4) {
hist(sims[[s]]$L, breaks=100, main=paste0(species[s], sizes[n], replicates[r]))
abline(v=sims[[s]]$L[sims[[s]][[n]][[r]]$YSim==1], col="red")
    }
  }
}

  + For each sample-size x species category, make a histogram of the site IDs that are selected. Ideally there should be some variation here, in that each of the 30 replicates have different sites at which the species is present.

Plot in niche space

(a) Wide Niche - Community Average

Note: For brevity, only the first 3 reps of the first 10 simulated species are shown